packages <- c("devtools","knitr","tidyverse","widgetframe","readr",
              "wordcloud", "base64enc", "tidytext", 
              "RWeka","stats","manifestoR","readtext",
              "rvest", "stringr", 
              "SnowballC", "plotrix", "tidyr", "tidytext", "stats", 
              "dendextend", "ggthemes",
              "httr","jsonlite", "DT", "textdata", "ggmap","maptools","mapproj","rgeos","rgdal",
              "RColorBrewer", "stringr","scales", "leaflet", 'leafpop', "ggthemes", "ggtext", "wordcloud")

packages <- lapply(packages, FUN = function(x) {
  if(!require(x, character.only = TRUE)) {
    install.packages(x)
  library(x, character.only = TRUE)
  }
}
)

Overview

For this project, we explored reported side effects of the covid-19 vaccine, focusing on adverse reactions reported from 2020-12-01 to 2021-3-31. The visualization aims to provide an insight into who are the people reporting side effects and how do they compare to the general population; the most common reported symptoms etc. We also analyzed tweets associated with the covid-19 vaccine to see people’s attitudes on the vaccine.

Data

Who Are the People Reporting Side Effects ?

Age and Gender

Do elders suffer more from side effects ? Not exactly

vac <- read_csv("part1data/2021VAERSDATA.csv")
sym <- read_csv("part1data/2021VAERSSYMPTOMS.csv")
vax <- read_csv("part1data/2021VAERSVAX.csv")

#convert vaccination date
vac$VAX_DATE <- as.Date(vac$VAX_DATE, format = "%m/%d/%Y") 

#merge the first two csv file by patient ID
merge1 <- left_join(vac, sym, by = "VAERS_ID")
#merge the third csv together
merge <- left_join(merge1, vax, by = "VAERS_ID")

#filter for COVID19 vaccine
covid <- merge %>% 
  filter(VAX_TYPE == "COVID19")

#find out patient's age and gender
agesex <- covid %>% 
  distinct(VAERS_ID,.keep_all = TRUE) %>% #distinct patients
  select(AGE_YRS, SEX, VAX_DATE) %>% 
  filter(AGE_YRS >= 18 & AGE_YRS != 'NA', VAX_DATE >= "2020-12-01" & SEX != "U" ) #filter for age over 18, vaccination data after 2020-12-01 and filter out unknown value for sex

as <- agesex %>% 
  group_by(SEX,AGE_YRS) %>% 
  count(SEX,AGE_YRS)
as
## # A tibble: 176 x 3
## # Groups:   SEX, AGE_YRS [176]
##    SEX   AGE_YRS     n
##    <chr>   <dbl> <int>
##  1 F          18    31
##  2 F          19    74
##  3 F          20    92
##  4 F          21   115
##  5 F          22   169
##  6 F          23   190
##  7 F          24   231
##  8 F          25   316
##  9 F          26   321
## 10 F          27   339
## # … with 166 more rows
ggplot(as, aes(x = AGE_YRS, y = n, color = SEX))+
  geom_line(size = 1)+ 
  labs(title = "Covid Vaccine Side Effects Reported Based on Age and Sex<br> By Mar 31, 2021",x="Age",y = "Number of People Reported") + 
  scale_x_continuous(breaks = seq(20,110,10), limits = c(15, 110)) + scale_y_continuous(breaks = seq(0,600,200), limits = c(0, 600))+
  theme_minimal()  + theme(plot.title = element_markdown(hjust = 0.5),legend.title = element_blank()) + scale_color_manual(values = c("#fdb863","#2d004b"),labels = c("Female", "Male"))

We could see that in general, women and younger people seem to suffer more from side effects. As age increased, the report number actually decreased, especially for women. Do elders suffer more from side effects? Not exactly. Is it possible there are fewer elders who got vaccinated thus fewer reports? We decided to dive deeper into who got vaccinated by looking at different age groups.

Vaccinated rate by different age group

#vaccinated number by age group from cdc
agegroup <- read_csv("part1data/demographic_trends_of_people_receiving_covid19_vaccinations_in_the_united_states.csv")
agegroup$Date <- as.Date(agegroup$Date, format = "%Y/%m/%d") 
ageg <- agegroup %>% 
  filter(Date <= "2021-03-31") %>% #filter for date before 2021/03/31
  filter(`Age Group` != "Age_unknown" & `Age Group` != "Age_known" & `Age Group` != "Ages_<18yrs") %>%  #filter for age > 18
  select(Date, `Age Group`, `People with at least one dose`, `Percent of age group with at least one dose`, Census)

ageg$Age <- with(ageg, ifelse(`Age Group` == "Ages_18-29_yrs", "18-29",
                              ifelse(`Age Group` == "Ages_30-39_yrs", "30-39",
                                            ifelse(`Age Group` == "Ages_40-49_yrs", "40-49",
                                                   ifelse(`Age Group` == "Ages_50-64_yrs", "50-64",
                                                          ifelse(`Age Group` == "Ages_65-74_yrs", "65-74",
                                                                 ifelse(`Age Group` == "Ages_75+_yrs", "75+", "other")))))))
ageg
## # A tibble: 636 x 6
##    Date       `Age Group`  `People with at le… `Percent of age gro… Census Age  
##    <date>     <chr>                      <dbl>                <dbl>  <dbl> <chr>
##  1 2021-03-31 Ages_75+_yrs            15827616                 74.1 2.14e7 75+  
##  2 2021-03-30 Ages_75+_yrs            15755182                 73.8 2.14e7 75+  
##  3 2021-03-29 Ages_75+_yrs            15717469                 73.6 2.14e7 75+  
##  4 2021-03-28 Ages_75+_yrs            15663561                 73.4 2.14e7 75+  
##  5 2021-03-27 Ages_75+_yrs            15575262                 72.9 2.14e7 75+  
##  6 2021-03-26 Ages_75+_yrs            15462377                 72.4 2.14e7 75+  
##  7 2021-03-25 Ages_75+_yrs            15336407                 71.8 2.14e7 75+  
##  8 2021-03-24 Ages_75+_yrs            15231705                 71.3 2.14e7 75+  
##  9 2021-03-23 Ages_75+_yrs            15131315                 70.9 2.14e7 75+  
## 10 2021-03-22 Ages_75+_yrs            15049439                 70.5 2.14e7 75+  
## # … with 626 more rows
 g <-  ggplot() + geom_line(data = ageg, aes(x = Date, y =`Percent of age group with at least one dose`, color = Age))+ scale_color_brewer(palette = "PuOr") +scale_y_continuous(breaks = seq(0,100,25), limits = c(0,100))+labs(title = "Percentage of People that Have Received at Least One Dose of Cov Vaccine<br>by Age Group", subtitle = "2020/12/16 - 2021/3/31",x="",y = "") +  theme_minimal() + theme(plot.title = element_markdown(hjust=0.5)) + theme(legend.position = "top", legend.title = element_text(size = 8)) +  theme(plot.subtitle=element_text(hjust=0.5))  + scale_y_continuous(labels = c("0" = "0", "25" = "25%", "50" = "50%", "75" = "75%", "100" = "100%")) + guides(colour = guide_legend(nrow = 1)) 
  #, labels = c("18-29",  "30-39",  "40-49", "50-64",  "65-74", "75+"

library(plotly)
ggplotly(g) %>% layout(title = paste0("Vaccinated Rate by Different Age Group", "<br>","2020/12/16- 2021/3/31"))

By mar 31, 2021, more than 75% of people over 65 had received at least one dose of covid-19 vaccine; a much higher percentage than younger group.

Report rate by different age group

# report num
yrs <- as %>% 
  group_by(AGE_YRS) %>% 
  summarise(sum = sum(n))
yrs$group <- with(yrs, ifelse(AGE_YRS >= 18 & AGE_YRS <= 29, "Ages_18-29_yrs",
                              ifelse(AGE_YRS >= 30 & AGE_YRS <= 39, "Ages_30-39_yrs",
                                            ifelse(AGE_YRS >= 40 & AGE_YRS <= 49, "Ages_40-49_yrs",
                                                   ifelse(AGE_YRS >= 50 & AGE_YRS <= 64, "Ages_50-64_yrs",
                                                          ifelse(AGE_YRS >= 65 & AGE_YRS <= 74, "Ages_65-74_yrs",
                                                                 ifelse(AGE_YRS >= 75, "Ages_75+_yrs", "other")))))))
# report cases by different age group
reportbygroup <- yrs %>% 
  group_by(group) %>% 
  summarise(reportnumber = sum(sum))

#join vaccinated cases to calculate report rate
report <- left_join(reportbygroup,ageg, by = c("group" = "Age Group"))
reportrate <-  report %>% 
  filter(Date == "2021-03-31") %>% # vaccinated number until 2021/03/31
  select(group, reportnumber,`People with at least one dose`) %>%
  mutate(Rate = round(reportnumber*10000 / `People with at least one dose`,2)) #rate

reportrate
## # A tibble: 6 x 4
##   group          reportnumber `People with at least one dose`  Rate
##   <chr>                 <int>                           <dbl> <dbl>
## 1 Ages_18-29_yrs         3276                         7819799  4.19
## 2 Ages_30-39_yrs         5911                         9734388  6.07
## 3 Ages_40-49_yrs         5612                        10759693  5.22
## 4 Ages_50-64_yrs         7461                        23910182  3.12
## 5 Ages_65-74_yrs         3798                        21723090  1.75
## 6 Ages_75+_yrs           3978                        15827616  2.51
ggplot()+
  geom_col(data = reportrate, aes(x = group, y = Rate, fill = group)) + scale_fill_brewer(palette = "PuOr")+
  labs(title = "Number of Reported Side Effects Cases per10k Vaccinated People<br> by Mar 31, 2021",x="Age",y = "Number of Reported Cases") + 
  scale_x_discrete(labels = c("Ages_18-29_yrs" = "18-29", "Ages_30-39_yrs" = "30-39", "Ages_40-49_yrs" = "40-49", "Ages_50-64_yrs" = "50-64", "Ages_65-74_yrs" = "65-74", "Ages_75+_yrs" = "75+")) + 
  theme_minimal()   +
  theme(plot.title = element_markdown(hjust = 0.5),legend.position = "none") 

By mar31,2021, for people over 75 years old, fewer than 3 of every 10,000 people who received at least one dose of COVID-19, reported adverse effects. In contrast, those aged 30 to 39 reported the highest rate of adverse effects, at 6 out of every 10, 000 people who received the vaccine. So, counter-intuitively, it seems that elders are less vulnerable to side effects. Some articles suggest that the immune response may actually be stronger in the younger group, so side effects may be more pronounced for the younger. The results of our data seem to confirm this. However, there may be other reasons for the lower rate of reported side effects in the elderly group, such as the elderly group is not as likely to use computers as the younger group, which affects the reporting rate, etc

Pre-illness

Medical history and Pre-illness are also strong indicators to predict suitable candidates for covid-19 vaccines. So we decided to look at the most common pre-illness of these people who reported adverse reactions.

his <- vac %>% 
  filter(AGE_YRS >= 18 & AGE_YRS != 'NA', VAX_DATE >= "2020-12-01" & SEX != "U" ) %>% #filter for age >18 , vaccination data after 2020-12-01 and filter out unknown value for sex
  filter(HISTORY != "None" & HISTORY != "NA" & HISTORY !="N/A" & HISTORY != "n/a" & HISTORY !="no" & HISTORY != "none") %>% 
  select(VAERS_ID,HISTORY) %>% 
  rename(doc_id = VAERS_ID, text = HISTORY) 

#cleaning
new_stops <- c("no", "history","medical", "conditions", "historyconcurrent", "disease","patient", "relevant","chronic","none", stopwords("en"))

his$text <- iconv(his$text, "UTF-8", "UTF-8",sub='')
his$text = removePunctuation(his$text)
his$text=tolower(his$text)
his$text=removeWords(his$text, new_stops)
his$text=removeNumbers(his$text)
  
require(RWeka)
# Make tokenizer function 
tokenizer <- function(x) 
  NGramTokenizer(x, Weka_control(min = 1, max = 3))

hisd <- data.frame("history" = tokenizer(his$text))
hisd <- hisd %>% 
  group_by(history) %>% 
  count(history) %>% 
  arrange(desc(n)) 
#write.csv(hisd, "/Users/hannahz/Desktop/data visualization\\history.csv")
#after counting, I manually filtered most common 50 pre-illness. I combind the same illness (hypertension, high blood pressure)
history <- read_csv("part1data/pre-illness_history_clean.csv")

purple_orange <- brewer.pal(10, "RdYlBu")
set.seed(2103)
wordcloud(history$history, history$n,
         max.words = 50, colors = purple_orange)

Hypertension, asthma, and diabetes are the most common pre-illness mentioned by people reporting side effects. One thought is that respondents with those diseases might be more vulnerable to vaccine side effects. However, Is it possible that those symptoms reported are actually caused by pre-illness but rather than vaccines? People might associate health problems that would have happened anyway with the vaccines. We haven’t come up with a more accurate way to describe this relationship but it is worth exploring.

When Did Side Effects Kick in ?

nums <- vac %>% 
  filter(AGE_YRS >= 18 & AGE_YRS != 'NA', VAX_DATE >= "2020-12-01" & SEX != "U" & NUMDAYS != "NA") %>%
  select(SEX,AGE_YRS,NUMDAYS) 

nums$group <- with(nums, ifelse(AGE_YRS >= 18 & AGE_YRS <= 29, "18-29",
                              ifelse(AGE_YRS >= 30 & AGE_YRS <= 39, "30-39",
                                            ifelse(AGE_YRS >= 40 & AGE_YRS <= 49, "40-49",
                                                   ifelse(AGE_YRS >= 50 & AGE_YRS <= 64, "50-64",
                                                          ifelse(AGE_YRS >= 65 & AGE_YRS <= 74, "65-74",
                                                                 ifelse(AGE_YRS >= 75, "75+", "other")))))))

numbyages <- nums %>% 
  group_by(SEX,group) %>% 
  summarise(mean = mean(NUMDAYS))

numbyages
## # A tibble: 12 x 3
## # Groups:   SEX [2]
##    SEX   group  mean
##    <chr> <chr> <dbl>
##  1 F     18-29  1.94
##  2 F     30-39  2.37
##  3 F     40-49  2.32
##  4 F     50-64  2.33
##  5 F     65-74  3.12
##  6 F     75+    4.69
##  7 M     18-29  1.58
##  8 M     30-39  2.33
##  9 M     40-49  2.33
## 10 M     50-64  2.59
## 11 M     65-74  3.74
## 12 M     75+    5.58
ggplot(numbyages, aes(x = group, y = mean, fill = SEX)) + geom_col(position = "dodge", width = 0.6) + labs(title = "Side Effects Average Onset Days After Vaccination", x ="Age", y = "Number of Days after Vaccination") +
  theme_minimal()  + scale_y_continuous(breaks = seq(1,6,1), limits = c(0,6)) + theme(plot.title = element_text(hjust = 0.5), legend.title = element_blank())+scale_fill_manual(values = c("#fdb863","#2d004b"),labels = c("Female", "Male"))

In general, the younger group seems to be more sensitive about side effects; they reported symptoms under 3 days after vaccination. Elders tend to be slower in feeling the symptoms. Side effects tend to kick in early for males in the younger group. In the elder group, on the contrary, females seem to suffer earlier from the symptoms.

What Are the Symptoms?

Top 10 common symptoms

#select symptoms out
sympword  <-  covid %>% 
  select(SYMPTOM1, SYMPTOM2, SYMPTOM3, SYMPTOM4, SYMPTOM5)
# list all the symptoms out in a column
symd1 <- data.frame(symptom=unlist(sympword, use.names = FALSE))
#filter out na value
symd1 <- na.omit(symd1)

common10 <- symd1 %>% 
  group_by(symptom) %>% 
  count(symptom) %>%
  arrange(desc(n)) %>% 
  ungroup() %>% 
  mutate(rank = row_number()) %>% 
  filter(rank <= 10)

ggplot(common10) + geom_col(aes(x = n, y = reorder(symptom, n)), fill ="#fdd49e", width = 0.7) + scale_x_continuous(name = "Number of Symptom Reported", breaks = seq(0,8000,1000)) + labs(title = "Top10 Side Effect Symptoms", y = "")+theme_minimal() + theme_minimal() +  theme(plot.title = element_text(hjust = 0.5))

Emotional words in symptoms description

senti <- vac %>% 
  filter(AGE_YRS >= 18 & AGE_YRS != 'NA', VAX_DATE >= "2020-12-01" & SEX != "U") %>%
  select(VAERS_ID,SYMPTOM_TEXT) %>% 
  rename(doc_id = VAERS_ID, text = SYMPTOM_TEXT)
senti$text <- iconv(senti$text, "UTF-8", "UTF-8",sub='')

#convert to a df corpus
df_source <- DataframeSource(senti)
df_corpus <- VCorpus(df_source)
df_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 30095
#filter out  neutral words
new_stops <- c("can", "make", "patient", "vaccine", "received","included","medical","experienced", "outcome", "feeling","receiving", "time", "female", "information", "immunization","shot", "nurse","doctor", "action" ," including","noted","shoulder","physician","visit","reporter", "ethics", "resident",stopwords("en"))

#clean corpus
clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, c(new_stops))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, stripWhitespace)
  return(corpus)
}

cleancor <- clean_corpus(df_corpus)

top_dtm <- DocumentTermMatrix(cleancor)

# convert to tidytext
top_td <- tidy(top_dtm)

#using NRC 
# I had some difficulty connecting to the tidytext package and downloading the NRC(probabily because I'm using a vpn) , so I downloaded manually 
txt <- read.table("part1data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt")
txt <- txt %>% 
  filter(V3 != 0) %>% 
  select(V1, V2)
nrc <- dplyr::rename(txt, word = V1, sentiment = V2)


emotion <- top_td %>% 
  inner_join(nrc, by = c(term = "word")) %>% 
  group_by(sentiment) %>% 
  count(term) %>% 
  arrange(desc(n)) %>% 
  mutate(rank = row_number()) %>% 
  mutate(number =  n/1000) %>% 
  filter(rank <= 10)
emotion
## # A tibble: 100 x 5
## # Groups:   sentiment [10]
##    sentiment    term          n  rank number
##    <chr>        <chr>     <int> <int>  <dbl>
##  1 fear         injection  7068     1   7.07
##  2 fear         pain       6282     2   6.28
##  3 negative     pain       6282     1   6.28
##  4 sadness      pain       6282     1   6.28
##  5 negative     headache   6219     2   6.22
##  6 fear         fever      5571     3   5.57
##  7 negative     fatigue    3588     3   3.59
##  8 disgust      nausea     3215     1   3.22
##  9 negative     nausea     3215     4   3.22
## 10 anticipation unknown    2966     1   2.97
## # … with 90 more rows
  ggplot(data = emotion, aes(reorder(term, number), number, fill=sentiment)) +geom_bar(stat="identity", show.legend = FALSE)  +scale_y_continuous(breaks = seq(0,6,2), labels = c("0" = "0", "2" = "2k", "4" = "4k", "6" = "6k"))+facet_wrap(~sentiment, scales="free_y", ncol=5) +labs(y = "Number of Times Expressed", x = NULL, title = "Emotion Words Expressed in Symptoms Description") +coord_flip()   + scale_fill_brewer(palette = "Paired")  + theme(plot.title = element_text(hjust = 0.5))

Negative, disgust, fear, sadness are the most frequently expressed emotions in the symptoms text, which is reasonable because people were experiencing uncomfortable feelings in their bodies.